
### CURRENT EFFORTS AS OF May 17, 2017

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

aa0=pd.read_csv('/Kaggle Russian House/train.csv',sep=',',header=0)
aa02=pd.read_csv('/Kaggle Russian House/macro.csv',sep=',',header=0)

## MERGE DATASETS
df=pd.merge(aa0, aa02, how='left', on='timestamp')
df.columns.get_loc("price_doc")
df=shuffle(df)

## DELETE VARIABLES WITH MORE THAN 15% MISSING VALUES
miss=[df.iloc[:,i].isnull().sum()/df.shape[0] for i in range(0,df.shape[1])]
delete=np.where(np.array(miss)>.15)[0]

df2=df.drop(df.columns[delete], axis=1)
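
## (Cross-check, not part of the original run: the same 15% rule can be written as a
## one-liner using the per-column missing fraction; df2_check is a hypothetical name.)
df2_check=df.loc[:,df.isnull().mean()<=.15]
assert list(df2_check.columns)==list(df2.columns)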

## TURN NOMINAL DATA INTO NUMERICAL VALUES
categoricals=[]
for i in range(0,df2.shape[1]):
    if df2.iloc[:,i].dtype=='object':
        categoricals.append(1)
    else:
        categoricals.append(0)

for i in np.where(np.array(categoricals)==1)[0]:
    col=df2.columns[i]
    df2[col]=pd.Categorical(df2[col]).codes
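
## (Side note: the two loops above can be written more compactly with pandas'
## category dtype; shown commented out since the conversion has already been applied.)
## for col in df2.select_dtypes(include=['object']).columns:
##     df2[col]=df2[col].astype('category').cat.codes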

## REPLACE MISSING VALUES WITH THE COLUMN MEAN
for i in range(0,df2.shape[1]):
    df2.iloc[:,i]=df2.iloc[:,i].fillna(df2.iloc[:,i].mean())

## CHECK CORRELATIONS
ab=df2.corr()

loc_price=df2.columns.get_loc("price_doc")
part0=np.where(np.array(ab["price_doc"])>.28)[0]
range0=np.delete(range(0,df2.shape[1]),loc_price)

## FEATURE ENGINEERING
for i in part0[0:4]:
    for j in range0:
        df2['engineer'+str(i)+'x'+str(j)]=df2.iloc[:,i]*df2.iloc[:,j]

## CHECK CORRELATIONS
st2=[]
for i in range(0,df2.shape[1]):
    st2.append(np.corrcoef(df2.iloc[:,i],df2['price_doc'])[0][1])

## PICK VARIABLES WITH CORRELATION GREATER THAN .4
p2=np.where(np.array(st2)>.4)[0]
loc_price=df2.columns.get_loc("price_doc")
best=p2[p2!=loc_price]   ## exclude price_doc itself from the predictors
df24=df2

X_train0=df24.iloc[:,best].astype(np.float64)
Y_train0=df24['price_doc'].astype(np.float64)

## REMOVE OUTLIERS 98% CONFIDENCE INTERVAL
thre=2.33
out1=np.where(Y_train0>thre*np.std(Y_train0))[0]
out=[]
for i in range(0,X_train0.shape[1]):
    out.append(np.where(X_train0.iloc[:,i]>thre*np.std(X_train0.iloc[:,i]))[0])
out2=np.concatenate(out,axis=0)
delete=np.unique(np.concatenate((out1,out2),axis=0))
len(np.unique(np.concatenate(out,axis=0)))


## "delete" holds row positions, so map them to index labels before dropping
X_train1=X_train0.drop(X_train0.index[delete])
Y_train1=Y_train0.drop(Y_train0.index[delete])
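
## (Side check, not part of the filtering above: the 2.33 cutoff is applied to raw
## values rather than to deviations from the mean. A centered, two-sided version of
## the same 98% rule would flag the rows below instead; out1_centered is a
## hypothetical name used only for this comparison.)
z=(Y_train0-Y_train0.mean())/Y_train0.std()
out1_centered=np.where(np.abs(z)>thre)[0]
len(out1_centered)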

## PREPARE TRAINING SET AND VALIDATION SET
end=int(Y_train1.shape[0]*.8)

## NORMALIZATION
def norm(x):
    return (x-np.min(x))/(np.max(x)-np.min(x))

## APPLY LOG10 AND NORMALIZE
for i in range(0,X_train1.shape[1]):
    X_train1.iloc[:,i]=X_train1.iloc[:,i].replace(0,0.000001)
    X_train1.iloc[:,i]=norm(np.log10(X_train1.iloc[:,i]))

Y_train1=Y_train1.replace(0,0.000001)
Y_train1=np.log10(Y_train1)
X_train=X_train1[0:end]
Y_train=Y_train1[0:end]
X_test0=X_train1[end:len(Y_train1)]
Y_test=Y_train1[end:len(Y_train1)]

from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(loss='ls', learning_rate=0.73, n_estimators=100,
                                  subsample=1, min_samples_split=100, min_samples_leaf=30,
                                  min_weight_fraction_leaf=0.0, max_depth=150, init=None,
                                  random_state=None, max_features=X_train.shape[1], alpha=0.9,
                                  verbose=1, max_leaf_nodes=None, warm_start=False)

model.fit(X_train, Y_train)
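## TRAINING LOG (verbose=1 output):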

      Iter       Train Loss   Remaining Time 
         1           0.0092            4.21m
         2           0.0049            4.15m
         3           0.0046            4.03m
         4           0.0046            3.85m
         5           0.0046            3.60m
         6           0.0046            3.34m
         7           0.0046            3.14m
         8           0.0046            2.99m
         9           0.0046            2.87m
        10           0.0046            2.74m
        20           0.0046            1.97m
        30           0.0046            1.71m
        40           0.0046            1.47m
        50           0.0046            1.22m
        60           0.0046           58.73s
        70           0.0046           44.09s
        80           0.0046           29.38s
        90           0.0046           14.66s
       100           0.0046            0.00s
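
## (Side check, not in the original run: GradientBoostingRegressor exposes
## staged_predict, which yields the prediction after each boosting stage, so the
## validation RMSE can be traced per iteration to see where it stops improving;
## val_curve is a hypothetical name introduced only for this check.)
val_curve=[np.sqrt(np.mean((pred_i-Y_test)**2)) for pred_i in model.staged_predict(X_test0)]
plt.plot(val_curve)
plt.show()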

## CHECK FIT ON TRAINING SET AND VALIDATION SET
pred0=model.predict(X_train)

error=np.mean(abs(pred0-Y_train))/len(Y_train)
p1=np.log(pred0+1)
r1=np.log(Y_train+1)
where_are_NaNsp1 = np.isnan(p1)
where_are_NaNsr1 = np.isnan(r1)
p1[where_are_NaNsp1] = 0.003
r1[where_are_NaNsr1] = 0.003

RMSLE=np.sqrt(np.sum((p1-r1)**2)/len(Y_train))

pred02=model.predict(X_test0)

error2=np.mean(abs(pred02-Y_test))/len(Y_test)
p12=np.log(pred02+1)
r12=np.log(Y_test+1)
where_are_NaNsp12 = np.isnan(p12)
where_are_NaNsr12 = np.isnan(r12)
p12[where_are_NaNsp12] = 0.003
r12[where_are_NaNsr12] = 0.003

RMSLE2=np.sqrt(np.sum((p12-r12)**2)/len(Y_test))
print('RMSLE train=',RMSLE,'error train=',error)
print('RMSLE test=',RMSLE2,'error test=',error2)

## RESULTS
RMSLE train= 0.000226818195068 error train= 2.219159781896118e-08
RMSLE test= 0.0187713762296 error test= 3.144649708793386e-05
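
## (The two RMSLE blocks above repeat the same computation; a small helper wrapping
## that exact formula, including the 0.003 NaN fill, could replace both. rmsle_log
## is a hypothetical name; e.g. rmsle_log(pred0, Y_train) reproduces the training
## RMSLE above.)
def rmsle_log(pred, actual):
    p=np.log(np.array(pred)+1)
    r=np.log(np.array(actual)+1)
    p[np.isnan(p)]=0.003
    r[np.isnan(r)]=0.003
    return np.sqrt(np.sum((p-r)**2)/len(actual))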

plt.plot(np.sort(pred0))
plt.plot(np.sort(Y_train))
plt.show()

## WORK ON THE KAGGLE TEST SET
ab2=pd.read_csv('/Volumes/16 DOS/Python/Kaggle Russian House/RussianHouse_test.csv',sep=',',header=0)
aa02=pd.read_csv('/Volumes/16 DOS/Python/Kaggle Russian House/Selected Best/macro.csv',sep=',',header=0)
ab2.shape

df3=pd.merge(ab2, aa02, how='left', on='timestamp')
df=df3

## REMOVE VARIABLES WITH MORE THAN 15% MISSING VALUES
miss=[df.iloc[:,i].isnull().sum()/df.shape[0] for i in range(0,df.shape[1])]
delete=np.where(np.array(miss)>.15)[0]

df2=df.drop(df.columns[delete], axis=1)

## TURN NOMINAL DATA INTO NUMERICAL VALUES
categoricals=[]
for i in range(0,df2.shape[1]):
    if df2.iloc[:,i].dtype=='object':
        categoricals.append(1)
    else:
        categoricals.append(0)

for i in np.where(np.array(categoricals)==1)[0]:
    col=df2.columns[i]
    df2[col]=pd.Categorical(df2[col]).codes

## REPLACE MISSING VALUES WITH THE COLUMN MEAN
for i in range(0,df2.shape[1]):
    df2.iloc[:,i]=df2.iloc[:,i].fillna(df2.iloc[:,i].mean())

## FEATURE ENGINEERING
for i in part0[0:4]:
    for j in range0:
        df2['engineer'+str(i)+'x'+str(j)]=df2.iloc[:,i]*df2.iloc[:,j]
df25=df2

usar=X_train.columns.values

X_test=df25.loc[:,usar]
for i in range(0,X_test.shape[1]):
    X_test.iloc[:,i]=X_test.iloc[:,i].replace(0,0.000001)
    X_test.iloc[:,i]=norm(np.log10(X_test.iloc[:,i]))
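
## (Caveat, side sketch: norm() re-derives min/max from the Kaggle test set itself,
## so a column can land on a slightly different scale than the one the model saw in
## training. A variant that takes externally supplied bounds, e.g. recorded from the
## training columns before they were normalized, avoids that; norm_with is a
## hypothetical helper, not used in the run above.)
def norm_with(x, lo, hi):
    return (x-lo)/(hi-lo)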


plt.plot(np.sort(X_test.iloc[:,0]))
plt.plot(np.sort(X_test.iloc[:,11]))

pred=10**model.predict(X_test)
id0=df3.iloc[:,0]

cc=np.array([np.array(id0),pred]).T

## SEND TO KAGGLE
np.savetxt("/Kaggle Russian House/COMPLETO_Norm.csv", cc, delimiter=',', header='id,price_doc', fmt = '%10i, %10f', comments='')
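
## (Equivalent submission writer using pandas, which avoids the fixed-width number
## formatting of np.savetxt; the output filename here is a hypothetical variant of
## the one above.)
sub=pd.DataFrame({'id': np.array(id0).astype(int), 'price_doc': pred})
sub.to_csv("/Kaggle Russian House/COMPLETO_Norm_pandas.csv", index=False)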
